# This script reproduces figures 18, 19, 20 # 

rm(list=ls())
options(digits=4)
pkg <- c("PanelMatch", "ggplot2", "plm", 
         "matrixStats",
         "foreign", 
         "stringi", "stringdist",
         "DataCombine","lmtest", "multiwayvcov",
         "data.table",
         "dplyr", 
         "stargazer",
         "clubSandwich", "mediation",
         "doBy")
lapply(pkg, require, character.only = TRUE)

# load datasets
load("other.RData")


big_list <- sets 
results <- list()
for (i in 1:length(big_list)) {
  
  data <- big_list[[i]]
  # delete XJ and TB as consistent with Lv & Landry APSR
  data <- data[data$province_id != 540000 & 
                 data$province_id != 650000, ]
  
  data <- slide(data = data, Var = "log_revn",
                NewVar = paste0("log_revn", "_f", 1),
                TimeVar = "time", GroupVar = "city_id",
                slideBy = 1)  
  
  
  formula <- as.formula(paste0(paste0("log_revn","_f", 1),
                               "~ tour + log_fia_l1 + log_fia_l1_missing + 
                               log_rdi_l1 + log_rdi_l1_missing + 
                               msec2currentsec_l1 + msec2currentsec_l1_missing + 
                               msec2currentgvn_l1 + msec2currentgvn_l1_missing + 
                               mayor2currentsec_l1 + mayor2currentsec_l1_missing + 
                               mayor2currentgvn_l1 + mayor2currentgvn_l1_missing + 
                               log_import_gdp_l1 +
                               log_export_gdp_l1 +
                               log_ypschool_l1  +
                               log_unemp_l1         +
                               log_wages_l1 +
                               log_total_gdp_l1 +
                               log_fdi_l1 +
                               log_gdppc_l1 + 
                               as.factor(city_id) + as.factor(time)"))
  
  
  
  
  mod.m1 <- lm(formula = formula,
               data = data)
  vcov.m1 <- clubSandwich::vcovCR(mod.m1,
                                  cluster = data$city_id, 
                                  type = "CR1S")
  results[[i]] <- c(coeftest(mod.m1, vcov.m1)["tour", "Estimate"],
                    coeftest(mod.m1, vcov.m1)["tour", "Std. Error"],
                    summary(mod.m1)$r.squared,
                    nrow(data))
}

mean_estimates <- mean(sapply(results, function(x) x[1]))
var_estimates <- var(sapply(results, function(x) x[1]))
se_rev_con <- sqrt(mean(sapply(results, function(x) x[2])^2) + var_estimates*(1+1/length(results)))
rsquared <- mean(sapply(results, function(x) x[3]))
n_obs <- mean(sapply(results, function(x) x[4]))

lower_rev_con_95 <- mean_estimates - qnorm(.975) * se_rev_con
upper_rev_con_95 <- mean_estimates + qnorm(.975) * se_rev_con

lower_rev_con_90 <- mean_estimates - qnorm(.95) * se_rev_con
upper_rev_con_90 <- mean_estimates + qnorm(.95) * se_rev_con


results_nolag <- list("lower_rev_con_95" = lower_rev_con_95,
                      "upper_rev_con_95" = upper_rev_con_95,
                      "lower_rev_con_90" = lower_rev_con_90,
                      "upper_rev_con_90" = upper_rev_con_90,
                      "est_rev_con" = mean_estimates,
                      "se" = se_rev_con,
                      "r.squared" = rsquared,
                      "N" = n_obs)

results <- list()
for (i in 1:length(big_list)) {
  
  data <- big_list[[i]]
  data <- data[data$province_id != 540000 & 
                 data$province_id != 650000, ]
  
  data <- slide(data = data, Var = "log_revn",
                NewVar = paste0("log_revn", "_l", 1),
                TimeVar = "time", GroupVar = "city_id",
                slideBy = -1)
  
  
  data <- slide(data = data, Var = "log_revn",
                NewVar = paste0("log_revn", "_f", 1),
                TimeVar = "time", GroupVar = "city_id",
                slideBy = 1)  
  
  
  formula <- as.formula(paste0(paste0("log_revn","_f", 1),
                               "~ tour + log_revn_l1 + log_fia_l1 + log_fia_l1_missing + 
                               log_rdi_l1 + log_rdi_l1_missing + 
                               msec2currentsec_l1 + msec2currentsec_l1_missing + 
                               msec2currentgvn_l1 + msec2currentgvn_l1_missing + 
                               mayor2currentsec_l1 + mayor2currentsec_l1_missing + 
                               mayor2currentgvn_l1 + mayor2currentgvn_l1_missing + 
                               log_import_gdp_l1 +
                               log_export_gdp_l1 +
                               log_ypschool_l1  +
                               log_unemp_l1         +
                               log_wages_l1 +
                               log_total_gdp_l1 +
                               log_fdi_l1 +
                               log_gdppc_l1 + 
                               as.factor(city_id) + as.factor(time)"))
  
  
  mod.m1 <- lm(formula = formula,
               data = data)
  vcov.m1 <- clubSandwich::vcovCR(mod.m1,
                                  cluster = data$city_id, 
                                  type = "CR1S")
  results[[i]] <- c(coeftest(mod.m1, vcov.m1)["tour", "Estimate"],
                    coeftest(mod.m1, vcov.m1)["tour", "Std. Error"],
                    summary(mod.m1)$r.squared,
                    nrow(data))
}

mean_estimates <- mean(sapply(results, function(x) x[1]))
var_estimates <- var(sapply(results, function(x) x[1]))
se_rev_con <- sqrt(mean(sapply(results, function(x) x[2])^2) + var_estimates*(1+1/length(results)))
rsquared <- mean(sapply(results, function(x) x[3]))
n_obs <- mean(sapply(results, function(x) x[4]))

lower_rev_con_95 <- mean_estimates - qnorm(.975) * se_rev_con
upper_rev_con_95 <- mean_estimates + qnorm(.975) * se_rev_con

lower_rev_con_90 <- mean_estimates - qnorm(.95) * se_rev_con
upper_rev_con_90 <- mean_estimates + qnorm(.95) * se_rev_con


results_lag <- list("lower_rev_con_95" = lower_rev_con_95,
                    "upper_rev_con_95" = upper_rev_con_95,
                    "lower_rev_con_90" = lower_rev_con_90,
                    "upper_rev_con_90" = upper_rev_con_90,
                    "est_rev_con" = mean_estimates,
                    "se" = se_rev_con,
                    "r.squared" = rsquared,
                    "N" = n_obs)


results <- list(results_nolag, results_lag)

####
pdf(file = "output/Figure18.pdf",  
    width = 10, height = 8)
### Plotting Stuff ###
plot(x = 0, xlab = "", ylab = "Effect Estimates", 
     pch = "",
     ylim = c(-0.05, .01),
     xlim = c(0.95, 1.15),
     xaxt = "n",
     main = "Effects of Inspection \n on Fiscal Extraction Next Quarter")
lines(c(1,1), c(results[[1]]$lower_rev_con_95, 
                results[[1]]$upper_rev_con_95))
lines(c(1,1), c(results[[1]]$lower_rev_con_90, 
                results[[1]]$upper_rev_con_90), 
      lwd = 5, col = "grey")
points(1, results[[1]]$est_rev_con, pch = 19)
mtext(side = 1, "Not controlling for \n lagged outcomes", at = 1, 
      outer = F,
      line = 1.2)

lines(c(1.1,1.1), c(results[[2]]$lower_rev_con_95, 
                    results[[2]]$upper_rev_con_95))
lines(c(1.1,1.1), c(results[[2]]$lower_rev_con_90, 
                    results[[2]]$upper_rev_con_90), 
      lwd = 5, col = "grey")
points(1.1, results[[2]]$est_rev_con, pch = 19)
mtext(side = 1, "Controlling for \n lagged outcomes", at = 1.1, 
      outer = F,
      line = 1.2)

abline(h = 0, lty = 3)

dev.off()

#########
### with province year ###
big_list <- sets 
results <- list()
for (i in 1:length(big_list)) {
  
  data <- big_list[[i]]
  data <- data[data$province_id != 540000 & 
                 data$province_id != 650000, ]
  

  data <- slide(data = data, Var = "log_revn",
                NewVar = paste0("log_revn", "_f", 1),
                TimeVar = "time", GroupVar = "city_id",
                slideBy = 1)  
  
  
  formula <- as.formula(paste0(paste0("log_revn","_f", 1),
                               "~ tour + log_fia_l1 + log_fia_l1_missing + 
                               log_rdi_l1 + log_rdi_l1_missing + 
                               msec2currentsec_l1 + msec2currentsec_l1_missing + 
                               msec2currentgvn_l1 + msec2currentgvn_l1_missing + 
                               mayor2currentsec_l1 + mayor2currentsec_l1_missing + 
                               mayor2currentgvn_l1 + mayor2currentgvn_l1_missing + 
                               log_import_gdp_l1 +
                               log_export_gdp_l1 +
                               log_ypschool_l1  +
                               log_unemp_l1         +
                               log_wages_l1 +
                               log_total_gdp_l1 +
                               log_fdi_l1 +
                               log_gdppc_l1 + 
                               as.factor(province_id) * as.factor(year) +
                               as.factor(city_id) + as.factor(time)"))
  
  
  mod.m1 <- lm(formula = formula,
               data = data)
  vcov.m1 <- clubSandwich::vcovCR(mod.m1,
                                  cluster = data$city_id, 
                                  type = "CR1S")
  results[[i]] <- c(coeftest(mod.m1, vcov.m1)["tour", "Estimate"],
                    coeftest(mod.m1, vcov.m1)["tour", "Std. Error"],
                    summary(mod.m1)$r.squared,
                    nrow(data))
}

mean_estimates <- mean(sapply(results, function(x) x[1]))
var_estimates <- var(sapply(results, function(x) x[1]))
se_rev_con <- sqrt(mean(sapply(results, function(x) x[2])^2) + var_estimates*(1+1/length(results)))
rsquared <- mean(sapply(results, function(x) x[3]))
n_obs <- mean(sapply(results, function(x) x[4]))

lower_rev_con_95 <- mean_estimates - qnorm(.975) * se_rev_con
upper_rev_con_95 <- mean_estimates + qnorm(.975) * se_rev_con

lower_rev_con_90 <- mean_estimates - qnorm(.95) * se_rev_con
upper_rev_con_90 <- mean_estimates + qnorm(.95) * se_rev_con


results_nolag <- list("lower_rev_con_95" = lower_rev_con_95,
                      "upper_rev_con_95" = upper_rev_con_95,
                      "lower_rev_con_90" = lower_rev_con_90,
                      "upper_rev_con_90" = upper_rev_con_90,
                      "est_rev_con" = mean_estimates,
                      "se" = se_rev_con,
                      "r.squared" = rsquared,
                      "N" = n_obs)

results <- list()
for (i in 1:length(big_list)) {
  
  data <- big_list[[i]]
  data <- data[data$province_id != 540000 & 
                 data$province_id != 650000, ]
  
  data <- slide(data = data, Var = "log_revn",
                NewVar = paste0("log_revn", "_l", 1),
                TimeVar = "time", GroupVar = "city_id",
                slideBy = -1)
  
  
  data <- slide(data = data, Var = "log_revn",
                NewVar = paste0("log_revn", "_f", 1),
                TimeVar = "time", GroupVar = "city_id",
                slideBy = 1)  
  
  
  formula <- as.formula(paste0(paste0("log_revn","_f", 1),
                               "~ tour + log_revn_l1 + log_fia_l1 + log_fia_l1_missing + 
                               log_rdi_l1 + log_rdi_l1_missing + 
                               msec2currentsec_l1 + msec2currentsec_l1_missing + 
                               msec2currentgvn_l1 + msec2currentgvn_l1_missing + 
                               mayor2currentsec_l1 + mayor2currentsec_l1_missing + 
                               mayor2currentgvn_l1 + mayor2currentgvn_l1_missing + 
                               log_import_gdp_l1 +
                               log_export_gdp_l1 +
                               log_ypschool_l1  +
                               log_unemp_l1         +
                               log_wages_l1 +
                               log_total_gdp_l1 +
                               log_fdi_l1 +
                               log_gdppc_l1 + 
                               as.factor(province_id) * as.factor(year) +
                               as.factor(city_id) + as.factor(time)"))
  
  
  mod.m1 <- lm(formula = formula,
               data = data)
  vcov.m1 <- clubSandwich::vcovCR(mod.m1,
                                  cluster = data$city_id, 
                                  type = "CR1S")
  results[[i]] <- c(coeftest(mod.m1, vcov.m1)["tour", "Estimate"],
                    coeftest(mod.m1, vcov.m1)["tour", "Std. Error"],
                    summary(mod.m1)$r.squared,
                    nrow(data))
}

mean_estimates <- mean(sapply(results, function(x) x[1]))
var_estimates <- var(sapply(results, function(x) x[1]))
se_rev_con <- sqrt(mean(sapply(results, function(x) x[2])^2) + var_estimates*(1+1/length(results)))
rsquared <- mean(sapply(results, function(x) x[3]))
n_obs <- mean(sapply(results, function(x) x[4]))

lower_rev_con_95 <- mean_estimates - qnorm(.975) * se_rev_con
upper_rev_con_95 <- mean_estimates + qnorm(.975) * se_rev_con

lower_rev_con_90 <- mean_estimates - qnorm(.95) * se_rev_con
upper_rev_con_90 <- mean_estimates + qnorm(.95) * se_rev_con


results_lag <- list("lower_rev_con_95" = lower_rev_con_95,
                    "upper_rev_con_95" = upper_rev_con_95,
                    "lower_rev_con_90" = lower_rev_con_90,
                    "upper_rev_con_90" = upper_rev_con_90,
                    "est_rev_con" = mean_estimates,
                    "se" = se_rev_con,
                    "r.squared" = rsquared,
                    "N" = n_obs)


results <- list(results_nolag, results_lag)
pdf(file = "output/Figure19.pdf",  
    width = 10, height = 8)
### Plotting Stuff ###
plot(x = 0, xlab = "", ylab = "Effect Estimates", 
     pch = "",
     ylim = c(-0.05, .01),
     xlim = c(0.95, 1.15),
     xaxt = "n",
     main = "Effects of Inspection \n on Fiscal Extraction Next Quarter")
lines(c(1,1), c(results[[1]]$lower_rev_con_95, 
                results[[1]]$upper_rev_con_95))
lines(c(1,1), c(results[[1]]$lower_rev_con_90, 
                results[[1]]$upper_rev_con_90), 
      lwd = 5, col = "grey")
points(1, results[[1]]$est_rev_con, pch = 19)
mtext(side = 1, "Not controlling for \n lagged outcomes", at = 1, 
      outer = F,
      line = 1.2)

lines(c(1.1,1.1), c(results[[2]]$lower_rev_con_95, 
                    results[[2]]$upper_rev_con_95))
lines(c(1.1,1.1), c(results[[2]]$lower_rev_con_90, 
                    results[[2]]$upper_rev_con_90), 
      lwd = 5, col = "grey")
points(1.1, results[[2]]$est_rev_con, pch = 19)
mtext(side = 1, "Controlling for \n lagged outcomes", at = 1.1, 
      outer = F,
      line = 1.2)
abline(h = 0, lty = 3)

dev.off()

############
big_list <- sets

results <- list()
for (i in 1:length(big_list)) {
  
  data <- big_list[[i]]
  data <- data[data$province_id != 540000 & 
                 data$province_id != 650000, ]
  
  tours <- tapply(data$tour, data$city_id, mean) 
  nts <- as.numeric(names(tours[tours == 0]) )
  data <- data[!data$city_id %in% nts, ]
  
  
  data <- slide(data = data, Var = "log_revn",
                NewVar = paste0("log_revn", "_f", 1),
                TimeVar = "time", GroupVar = "city_id",
                slideBy = 1)  
  
  
  formula <- as.formula(paste0(paste0("log_revn","_f", 1),
                               "~ tour + log_fia_l1 + log_fia_l1_missing + 
                               log_rdi_l1 + log_rdi_l1_missing + 
                               msec2currentsec_l1 + msec2currentsec_l1_missing + 
                               msec2currentgvn_l1 + msec2currentgvn_l1_missing + 
                               mayor2currentsec_l1 + mayor2currentsec_l1_missing + 
                               mayor2currentgvn_l1 + mayor2currentgvn_l1_missing + 
                               log_import_gdp_l1 +
                               log_export_gdp_l1 +
                               log_ypschool_l1  +
                               log_unemp_l1         +
                               log_wages_l1 +
                               log_total_gdp_l1 +
                               log_fdi_l1 +
                               log_gdppc_l1 + 
                               as.factor(city_id) + as.factor(time)"))
  
  
  
  
  mod.m1 <- lm(formula = formula,
               data = data)
  vcov.m1 <- clubSandwich::vcovCR(mod.m1,
                                  cluster = data$city_id, 
                                  type = "CR1S")
  results[[i]] <- c(coeftest(mod.m1, vcov.m1)["tour", "Estimate"],
                    coeftest(mod.m1, vcov.m1)["tour", "Std. Error"],
                    summary(mod.m1)$r.squared,
                    nrow(data))
}

mean_estimates <- mean(sapply(results, function(x) x[1]))
var_estimates <- var(sapply(results, function(x) x[1]))
se_rev_con <- sqrt(mean(sapply(results, function(x) x[2])^2) + var_estimates*(1+1/length(results)))
rsquared <- mean(sapply(results, function(x) x[3]))
n_obs <- mean(sapply(results, function(x) x[4]))

lower_rev_con_95 <- mean_estimates - qnorm(.975) * se_rev_con
upper_rev_con_95 <- mean_estimates + qnorm(.975) * se_rev_con

lower_rev_con_90 <- mean_estimates - qnorm(.95) * se_rev_con
upper_rev_con_90 <- mean_estimates + qnorm(.95) * se_rev_con


results_nolag <- list("lower_rev_con_95" = lower_rev_con_95,
                      "upper_rev_con_95" = upper_rev_con_95,
                      "lower_rev_con_90" = lower_rev_con_90,
                      "upper_rev_con_90" = upper_rev_con_90,
                      "est_rev_con" = mean_estimates,
                      "se" = se_rev_con,
                      "r.squared" = rsquared,
                      "N" = n_obs)

results <- list()
for (i in 1:length(big_list)) {
  
  data <- big_list[[i]]
  data <- data[data$province_id != 540000 & 
                 data$province_id != 650000, ]
  
  tours <- tapply(data$tour, data$city_id, mean) 
  nts <- as.numeric(names(tours[tours == 0]) )
  data <- data[!data$city_id %in% nts, ]
  
  data <- slide(data = data, Var = "log_revn",
                NewVar = paste0("log_revn", "_l", 1),
                TimeVar = "time", GroupVar = "city_id",
                slideBy = -1)
  
  
  data <- slide(data = data, Var = "log_revn",
                NewVar = paste0("log_revn", "_f", 1),
                TimeVar = "time", GroupVar = "city_id",
                slideBy = 1)  
  
  
  formula <- as.formula(paste0(paste0("log_revn","_f", 1),
                               "~ tour + log_revn_l1 + log_fia_l1 + log_fia_l1_missing + 
                               log_rdi_l1 + log_rdi_l1_missing + 
                               msec2currentsec_l1 + msec2currentsec_l1_missing + 
                               msec2currentgvn_l1 + msec2currentgvn_l1_missing + 
                               mayor2currentsec_l1 + mayor2currentsec_l1_missing + 
                               mayor2currentgvn_l1 + mayor2currentgvn_l1_missing + 
                               log_import_gdp_l1 +
                               log_export_gdp_l1 +
                               log_ypschool_l1  +
                               log_unemp_l1         +
                               log_wages_l1 +
                               log_total_gdp_l1 +
                               log_fdi_l1 +
                               log_gdppc_l1 + 
                               as.factor(city_id) + as.factor(time)"))
  
  
  mod.m1 <- lm(formula = formula,
               data = data)
  vcov.m1 <- clubSandwich::vcovCR(mod.m1,
                                  cluster = data$city_id, 
                                  type = "CR1S")
  results[[i]] <- c(coeftest(mod.m1, vcov.m1)["tour", "Estimate"],
                    coeftest(mod.m1, vcov.m1)["tour", "Std. Error"],
                    summary(mod.m1)$r.squared,
                    nrow(data))
}

mean_estimates <- mean(sapply(results, function(x) x[1]))
var_estimates <- var(sapply(results, function(x) x[1]))
se_rev_con <- sqrt(mean(sapply(results, function(x) x[2])^2) + var_estimates*(1+1/length(results)))
rsquared <- mean(sapply(results, function(x) x[3]))
n_obs <- mean(sapply(results, function(x) x[4]))

lower_rev_con_95 <- mean_estimates - qnorm(.975) * se_rev_con
upper_rev_con_95 <- mean_estimates + qnorm(.975) * se_rev_con

lower_rev_con_90 <- mean_estimates - qnorm(.95) * se_rev_con
upper_rev_con_90 <- mean_estimates + qnorm(.95) * se_rev_con


results_lag <- list("lower_rev_con_95" = lower_rev_con_95,
                    "upper_rev_con_95" = upper_rev_con_95,
                    "lower_rev_con_90" = lower_rev_con_90,
                    "upper_rev_con_90" = upper_rev_con_90,
                    "est_rev_con" = mean_estimates,
                    "se" = se_rev_con,
                    "r.squared" = rsquared,
                    "N" = n_obs)


results <- list(results_nolag, results_lag)
pdf(file = "output/Figure20.pdf",  
    width = 10, height = 8)
### Plotting Stuff ###
plot(x = 0, xlab = "", ylab = "Effect Estimates", 
     pch = "",
     ylim = c(-0.05, .01),
     xlim = c(0.95, 1.15),
     xaxt = "n",
     main = "Effects of Inspection \n on Fiscal Extraction Next Quarter")
lines(c(1,1), c(results[[1]]$lower_rev_con_95, 
                results[[1]]$upper_rev_con_95))
lines(c(1,1), c(results[[1]]$lower_rev_con_90, 
                results[[1]]$upper_rev_con_90), 
      lwd = 5, col = "grey")
points(1, results[[1]]$est_rev_con, pch = 19)
mtext(side = 1, "Not controlling for \n lagged outcomes", at = 1, 
      outer = F,
      line = 1.2)

lines(c(1.1,1.1), c(results[[2]]$lower_rev_con_95, 
                    results[[2]]$upper_rev_con_95))
lines(c(1.1,1.1), c(results[[2]]$lower_rev_con_90, 
                    results[[2]]$upper_rev_con_90), 
      lwd = 5, col = "grey")
points(1.1, results[[2]]$est_rev_con, pch = 19)
mtext(side = 1, "Controlling for \n lagged outcomes", at = 1.1, 
      outer = F,
      line = 1.2)
abline(h = 0, lty = 3)

dev.off()
